#### setting environment ####
require(readstata13)
require(lmtest)
require(multiwayvcov)
require(MASS)

#### data preparation ####
Reed.Smith.data <- read.dta13("Reed-Smith-JHRED-CANDIDATES.dta")

# candidates running for SMD seats in the 1996-2014 general elections
SMD.data <- subset(Reed.Smith.data, yr %in% 19:25 & kucode != 0)
# dummy variable for whether a candidate was elected in SMD
SMD.data$SMD.win <- (SMD.data$result == 1) * 1
# logged number of candidates in a district
SMD.data$log.ncand <- log(SMD.data$ku_ncand)
# dummy variable for incumbency
SMD.data$incumbent <- (SMD.data$inc > 0) * 1
# logged number of winning HOR elections
SMD.data$log.wins <- log(SMD.data$totcwins - (SMD.data$result > 0) * 1 + 1)
# district id
SMD.data$district.id <- as.numeric(factor(SMD.data$year * 10000 + SMD.data$kucode))
# dummy variables for open seats and newcomers' races
SMD.data$open.seat <- SMD.data$all.fresh <- 0
for (i in 1:max(SMD.data$district.id)) {
  district.subset.data <- subset(SMD.data, district.id == i)
  SMD.data$open.seat[SMD.data$district.id == i] <- (sum(district.subset.data$inc) == 0) * 1
  SMD.data$all.fresh[SMD.data$district.id == i] <- (sum(district.subset.data$log.wins > 0) == 0) * 1
}

# leave only LDP and DPJ candidates
two.party.data <- subset(SMD.data, party_id == 1 | party_id == 16)
# dummy variable for DPJ affiliation
two.party.data$DPJ <- (two.party.data$party_id == 16) * 1

#### logistic regressions (Table A.12) ####
## entire sample
# logistic regression
glm.result.1 <- glm(SMD.win ~ female + age + I(age ^ 2 / 100) + log.ncand + incumbent + log.wins + DPJ * factor(yr) - 1, 
                    data = two.party.data, x = TRUE)
# testing coefficients with the robust standard error
coeftest.result.1 <- coeftest(glm.result.1, 
                              vcov = cluster.vcov(glm.result.1, 
                                                  cluster = ~ pid + district.id))
# summary
round(coeftest.result.1, 3)
# number of observations
length(glm.result.1$residuals)

## open seats
# logistic regression
glm.result.2 <- glm(SMD.win ~ female + age + I(age ^ 2 / 100) + log.ncand + DPJ * factor(yr), 
                    data = two.party.data, subset = open.seat == 1, x = TRUE)
# testing coefficients with the robust standard error
coeftest.result.2 <- coeftest(glm.result.2, 
                              vcov = cluster.vcov(glm.result.2, 
                                                  cluster = ~ pid + district.id))
# summary
round(coeftest.result.2, 3)
# number of observations
length(glm.result.2$residuals)

# newcomers' races
# logistic regression
glm.result.3 <- glm(SMD.win ~ female + age + I(age ^ 2 / 100) + log.ncand + DPJ * factor(yr), 
                    data = two.party.data, subset = all.fresh == 1, x = TRUE)
# testing coefficients with the robust standard error
coeftest.result.3 <- coeftest(glm.result.3, 
                              vcov = cluster.vcov(glm.result.3, 
                                                  cluster = ~ pid + district.id))
# summary
round(coeftest.result.3, 3)
# number of observations
length(glm.result.3$residuals)

#### quasi-Bayesian Monte Carlo simulations (Table A.13) ####
## function for quasi-Bayesian Monte Carlo simulations
logit.simulation <- function(result,  # glm object
                             vcv,  # variance-covariance matrix
                             prob = 0.95,  # level of confidence interval
                             sim = 1000,  # number of simulations
                             seed = 12345)  # initial seed for variate generators
  {
  coef.est <- result$coefficients
  set.seed(seed)
  coef.draw <- mvrnorm(sim, coef.est, vcv)
  new.matrix.1 <- result$x
  new.matrix.1[, "female"] <- 0
  prob.1 <- exp(tcrossprod(coef.draw, new.matrix.1)) / (1 + exp(tcrossprod(coef.draw, new.matrix.1)))
  new.matrix.2 <- result$x
  new.matrix.2[, "female"] <- 1
  prob.2 <- exp(tcrossprod(coef.draw, new.matrix.2)) / (1 + exp(tcrossprod(coef.draw, new.matrix.2)))
  prob.diff <- prob.1 - prob.2
  sim.result <- matrix(NA, 3, 3)
  rownames(sim.result) <- c("male", "female", "difference")
  colnames(sim.result) <- c("mean", "lower", "upper")
  sim.result[1, 1] <- mean(prob.1)
  sim.result[1, 2:3] <- quantile(rowMeans(prob.1), probs = c((1 - prob) / 2, 1 - ((1 - prob) / 2)))
  sim.result[2, 1] <- mean(prob.2)
  sim.result[2, 2:3] <- quantile(rowMeans(prob.2), probs = c((1 - prob) / 2, 1 - ((1 - prob) / 2)))
  sim.result[3, 1] <- mean(prob.diff)
  sim.result[3, 2:3] <- quantile(rowMeans(prob.diff), probs = c((1 - prob) / 2, 1 - ((1 - prob) / 2)))
  sim.result
}

## entire sample
# simulation
vcv.sim.1 <- cluster.vcov(glm.result.1, 
                          cluster = ~ pid + district.id, force_posdef = TRUE)
# summary
round(logit.simulation(glm.result.1, vcv.sim.1), 3)

## open seats
# simulation
vcv.sim.2 <- cluster.vcov(glm.result.2, 
                          cluster = ~ pid + district.id, force_posdef = TRUE)
# summary
round(logit.simulation(glm.result.2, vcv.sim.2), 3)

## newcomers' races
# simulation
vcv.sim.3 <- cluster.vcov(glm.result.3, 
                          cluster = ~ pid + district.id, force_posdef = TRUE)
# summary
round(logit.simulation(glm.result.3, vcv.sim.3), 3)